import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import json
import pandas as pd
import xml.etree.ElementTree as ET
import time 

# basic_url = "https://bbs.hupu.com/api/v2/nav?url=%2F502"
basic_url = "https://bbs.hupu.com/502"
content_url = "https://bbs.hupu.com"
useragent = UserAgent()

headers = {
    'Accept': '*/*',
    "User-Agent": useragent.random,
    'cookie': 'smidV2=20221011145431dafbe67487be93893fc89df776f7cf6e00031aa623fb2f360; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22183c5d2fbc9c4b-0fd2353c9c7ac4-1a525635-1930176-183c5d2fbca158d%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgzYzVkMmZiYzljNGItMGZkMjM1M2M5YzdhYzQtMWE1MjU2MzUtMTkzMDE3Ni0xODNjNWQyZmJjYTE1OGQifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22183c5d2fbc9c4b-0fd2353c9c7ac4-1a525635-1930176-183c5d2fbca158d%22%7D; Hm_lvt_4fac77ceccb0cd4ad5ef1be46d740615=1702878413; Hm_lvt_b241fb65ecc2ccf4e7e3b9601c7a50de=1702878413; csrfToken=8oK63cwO53AjOlXKI7JGZsLg; Hm_lvt_df703c1d2273cc30ba452b4c15b16a0d=1704438917; acw_tc=2f624a3417046066457164808e242857841c5534b525054a766ec205429be4; _HUPUSSOID=167b0f54-034d-45d3-a007-babc70c299ee; u=94225435|6JmO5omRSlIxNDY4NzkwNzM1|effa|6bd3d095f15ba78b131d987bdb1422ca|f15ba78b131d987b|aHVwdV84NWRhYjRmNTA5NzIxY2Fj; ua=47350200; _CLT=00376064be821b71351c003dda774e37; us=7d1c7637d2de80549ded4ca208ce1ee4be1c2e23693052e11e8db85b3791f6c7749d4c02f4a454b924145ae710f5c2f730e57ed7e1779a839654bfb4497139f9; Hm_lpvt_df703c1d2273cc30ba452b4c15b16a0d=1704607287; .thumbcache_33f5730e7694fd15728921e201b4826a=lZ995kNZUWJM/9+sVP3928EOBU5dQiBuxpvW/2a100pSlgJ+xmvZ+bMtMLZzNN41UhR3/4oG5rvkkCsGswY8xg%3D%3D'
}

index = 782
pages = 20
# span class="post-user_post-user-comp-info-top-time__k9K2U"
for i in range(17, pages):
    url = basic_url + "-" + str(i + 1)
    print(f"----------开始爬取第 {i} 页 内容----------")
    r = requests.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    titles = soup.find_all('a', class_ = 'p-title')
    for title in titles:
        href = title.get('href')
        c_url = content_url + href
        c_response = requests.get(c_url)
        c_soup = BeautifulSoup(c_response.text, 'html.parser')
        c_title = c_soup.find('h1', class_ = "index_name__M5qqs").text
        c_datetime =c_soup.find('span', class_ = "post-user_post-user-comp-info-top-time__k9K2U").text
        c_body = c_soup.find('div', class_='thread-content-detail').get_text(strip = True)
        
        # Create xml
        root = ET.Element('doc')
        id_element = ET.SubElement(root, 'id')
        id_element.text = str(index)
        
        title_element = ET.SubElement(root, 'title')
        title_element.text = c_title
        
        url_element = ET.SubElement(root, 'url')
        url_element.text = c_url
        
        datetime_element = ET.SubElement(root, 'datetime')
        datetime_element.text = c_datetime
        
        body_element = ET.SubElement(root, 'body')
        body_element.text = c_body
        
        tree = ET.ElementTree(root)
        
        tree.write(f'hupu/news/new_xml/{index}.xml', encoding='utf-8', xml_declaration=True)
        
        index = index + 1
        
    print(f"截止第 {i} 页获得 {index} 条资讯")
    print("Begin Little Sleeping")
    time.sleep(3)


    